// Program for computing the transpose of an 8x8 bit matrix.
// Max line length is 57, to fit in hacker.book.
// (Well, we cheat a little and two lines are of length 58.)
// This has been tested on both AIX/xlc and Windows/gcc, and
// so is believed to be independent of endian mode and whether
// char is signed or unsigned.
#include <stdio.h>
#include <stdlib.h>     // To define "exit", req'd by XLC.
#include <string.h>

// Straightforward 8x8 bit-matrix transpose, one bit at a time.
// The input rows are read from A[0], A[m], ..., A[7*m] and the
// output rows are written to B[0], B[n], ..., B[7*n].  The most
// significant bit of each byte is the leftmost column.  Every
// input byte is read before any output byte is written.
void transpose8a(unsigned char A[8], int m, int n, unsigned char B[8]) {
   unsigned char row[8], col[8];
   int i, j;

   // Fetch the eight input rows, which sit m bytes apart.

   for (i = 0; i < 8; i++)
      row[i] = A[i*m];

// ------------------------------ cut ----------------------------------
   // Output row i gathers bit 7-i from each input row;
   // input row j lands in bit 7-j of that output row.
   for (i = 0; i < 8; i++) {
      unsigned bits = 0;

      for (j = 0; j < 8; j++)
         bits |= ((row[j] >> (7 - i)) & 1u) << (7 - j);
      col[i] = (unsigned char)bits;
   }
// ---------------------------- end cut --------------------------------

   // Store the eight result rows, n bytes apart.

   for (i = 0; i < 8; i++)
      B[i*n] = col[i];
}

// Decided not to include the procedure below in HD.
// Too similar to transpose8c, which is a little better (probably).

// Transpose an 8x8 bit matrix with masked shifts on two words.
// The input rows are read from A[0], A[m], ..., A[7*m] and the
// output rows are written to B[0], B[n], ..., B[7*n].  The most
// significant bit of each byte is the leftmost column.  All of
// A is read before B is written.  Requires unsigned to be at
// least 32 bits wide.
void transpose8b(unsigned char A[8], int m, int n,
                unsigned char B[8]) {
   unsigned x, y, t;

   // Load the array and pack it into x (rows 0-3) and y
   // (rows 4-7).  Each byte is converted to unsigned before it
   // is shifted: an unsigned char promotes to (signed) int,
   // and shifting that left by 24 is undefined when bit 7 of
   // the byte is set.

   x = ((unsigned)A[0]<<24)   | ((unsigned)A[m]<<16) |
       ((unsigned)A[2*m]<<8)  | A[3*m];
   y = ((unsigned)A[4*m]<<24) | ((unsigned)A[5*m]<<16) |
       ((unsigned)A[6*m]<<8)  | A[7*m];

   // Transpose every 2x2 block of bits: keep the diagonal
   // bits and exchange the two off-diagonal ones.

   x = (x & 0xAA55AA55) | ((x & 0x00AA00AA) << 7) |
       ((x >> 7) & 0x00AA00AA);
   y = (y & 0xAA55AA55) | ((y & 0x00AA00AA) << 7) |
       ((y >> 7) & 0x00AA00AA);

   // Within every 4x4 block, exchange the two off-diagonal
   // 2x2 blocks.

   x = (x & 0xCCCC3333) | ((x & 0x0000CCCC) << 14) |
       ((x >> 14) & 0x0000CCCC);
   y = (y & 0xCCCC3333) | ((y & 0x0000CCCC) << 14) |
       ((y >> 14) & 0x0000CCCC);

   // Exchange the two off-diagonal 4x4 blocks: the low
   // nibbles of x trade places with the high nibbles of y.

   t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
   y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
   x = t;

   // Unpack; each store keeps only the low eight bits.

   B[0]=x>>24;    B[n]=x>>16;    B[2*n]=x>>8;  B[3*n]=x;
   B[4*n]=y>>24;  B[5*n]=y>>16;  B[6*n]=y>>8;  B[7*n]=y;
}

// ------------------------------ cut ----------------------------------
// Transpose an 8x8 bit matrix held in two words, using
// exclusive-or swaps.  The input rows are read from A[0],
// A[m], ..., A[7*m]; the output rows are written to B[0],
// B[n], ..., B[7*n].  The most significant bit of each byte
// is the leftmost column.  All of A is read before B is
// written.  Requires unsigned to be at least 32 bits wide.
void transpose8c(unsigned char A[8], int m, int n,
                unsigned char B[8]) {
   unsigned x, y, t;

   // Load the array and pack it into x and y.  The bytes are
   // converted to unsigned first, because left-shifting the
   // promoted int by 24 is undefined when bit 7 is set.

   x = ((unsigned)A[0]<<24)   | ((unsigned)A[m]<<16) |
       ((unsigned)A[2*m]<<8)  | A[3*m];
   y = ((unsigned)A[4*m]<<24) | ((unsigned)A[5*m]<<16) |
       ((unsigned)A[6*m]<<8)  | A[7*m];

   // Transpose each 2x2 block of bits.  t marks where the
   // two fields differ; xoring it in at both places swaps.

   t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
   t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);

   // Swap the off-diagonal 2x2 blocks of each 4x4 block.

   t = (x ^ (x >>14)) & 0x0000CCCC;  x = x ^ t ^ (t <<14);
   t = (y ^ (y >>14)) & 0x0000CCCC;  y = y ^ t ^ (t <<14);

   // Swap the off-diagonal 4x4 blocks between x and y.

   t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
   y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
   x = t;

   // Unpack; each store keeps only the low eight bits.

   B[0]=x>>24;    B[n]=x>>16;    B[2*n]=x>>8;  B[3*n]=x;
   B[4*n]=y>>24;  B[5*n]=y>>16;  B[6*n]=y>>8;  B[7*n]=y;
}
// ---------------------------- end cut --------------------------------

// transpose8d is transpose8c done "backwards."

// Transpose an 8x8 bit matrix held in two words, applying the
// three block swaps from the largest blocks to the smallest.
// The input rows are read from A[0], A[m], ..., A[7*m]; the
// output rows are written to B[0], B[n], ..., B[7*n].  The
// most significant bit of each byte is the leftmost column.
// All of A is read before B is written.  Requires unsigned to
// be at least 32 bits wide.
void transpose8d(unsigned char A[8], int m, int n,
               unsigned char B[8]) {
   unsigned x, y, t;

   // Load the array and pack it into x and y.  The bytes are
   // converted to unsigned first, because left-shifting the
   // promoted int by 24 is undefined when bit 7 is set.

   x = ((unsigned)A[0]<<24)   | ((unsigned)A[m]<<16) |
       ((unsigned)A[2*m]<<8)  | A[3*m];
   y = ((unsigned)A[4*m]<<24) | ((unsigned)A[5*m]<<16) |
       ((unsigned)A[6*m]<<8)  | A[7*m];

   // Swap the off-diagonal 4x4 blocks between x and y.

   t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
   y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
   x = t;

   // Swap the off-diagonal 2x2 blocks of each 4x4 block.

   t = (x ^ (x >>14)) & 0x0000CCCC;  x = x ^ t ^ (t <<14);
   t = (y ^ (y >>14)) & 0x0000CCCC;  y = y ^ t ^ (t <<14);

   // Transpose each 2x2 block of bits.

   t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
   t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);

   // Unpack; each store keeps only the low eight bits.

   B[0]=x>>24;    B[n]=x>>16;    B[2*n]=x>>8;  B[3*n]=x;
   B[4*n]=y>>24;  B[5*n]=y>>16;  B[6*n]=y>>8;  B[7*n]=y;
}

int errors;   // Number of mismatches reported so far.

// Report one failed case: print the eight bytes of the test
// input T and the eight bytes of the result R in hex, then
// bump the global error count.
void error(unsigned char T[8], unsigned char R[8]) {
   printf("Error for test = %02x%02x%02x%02x %02x%02x%02x%02x, got "
      "%02x%02x%02x%02x %02x%02x%02x%02x\n",
      T[0], T[1], T[2], T[3],
      T[4], T[5], T[6], T[7],
      R[0], R[1], R[2], R[3],
      R[4], R[5], R[6], R[7]);

   ++errors;
}

// Run transpose variant number 'which' (0, 1, 2, 3 selects
// transpose8a, 8b, 8c, 8d) on A with unit strides, storing
// the result in B.
static void apply_variant(int which, unsigned char A[8],
                          unsigned char B[8]) {
   switch (which) {
   case 0:  transpose8a(A, 1, 1, B); break;
   case 1:  transpose8b(A, 1, 1, B); break;
   case 2:  transpose8c(A, 1, 1, B); break;
   default: transpose8d(A, 1, 1, B); break;
   }
}

// Test driver.  The table holds pairs of matrices, each the
// transpose of the other.  Every variant is run on both members
// of every pair and the result is compared with the partner.
// Returns EXIT_SUCCESS when every comparison matched, and
// EXIT_FAILURE otherwise, so that a failing run is visible to
// scripts and not just to someone reading the output.
int main(void) {
   static unsigned char test[][8] = {
      {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}, {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
      {0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01}, {0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01},
      {0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00}, {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10},
      {0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x00}, {0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00},
      {0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00}, {0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x00},
      {0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00}, {0x00,0x00,0x00,0x00,0x10,0x00,0x00,0x00},
      {0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55}, {0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF},
      {0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA}, {0xFF,0x00,0xFF,0x00,0xFF,0x00,0xFF,0x00},
      {0x7F,0x3F,0x1F,0x0F,0x07,0x03,0x01,0x00}, {0x00,0x80,0xC0,0xE0,0xF0,0xF8,0xFC,0xFE},
      {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00}, {0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0},
      {0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF}, {0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F},
      {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}, {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF},
   };
   // Heading printed before each variant's tests, in the
   // order understood by apply_variant.
   static const char *const name[] = {
      "transpose8a", "transpose8b", "transpose8c", "transpose8d"
   };

   unsigned char result[8];
   int i, v;
   int n = (int)(sizeof(test)/sizeof(test[0]));
   int nvariants = (int)(sizeof(name)/sizeof(name[0]));

   for (v = 0; v < nvariants; v++) {
      printf("%s:\n", name[v]);

      // Rows are consumed two at a time; "i + 1 < n" keeps
      // test[i+1] in bounds even if a row is left unpaired.
      for (i = 0; i + 1 < n; i += 2) {
         apply_variant(v, test[i], result);
         if (memcmp(test[i+1], result, 8) != 0)
            error(test[i], result);

         apply_variant(v, test[i+1], result);
         if (memcmp(test[i], result, 8) != 0)
            error(test[i+1], result);
      }
   }

   if (errors != 0)
      return EXIT_FAILURE;

   printf("Passed all %d cases and their reversals.\n", n/2);
   return EXIT_SUCCESS;
}
